{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "189063f2",
   "metadata": {},
   "source": [
    "### Features of misexpression-associated variants \n",
    "\n",
    "* Curate a set of misexpression-associated variant features: \n",
    "    * MSC VEP consequence\n",
    "    * MSC Gene-level VEP consequence \n",
    "    * Position \n",
    "    * Genomic scores \n",
    "    * Functional scores \n",
    "    * Gene-sample information \n",
    "    * SV allele frequency and count - AC and AF \n",
    "    * Gene information"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "83adfc3a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from pathlib import Path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "b9886b32",
   "metadata": {},
   "outputs": [],
   "source": [
    "wkdir = \"/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3/\"\n",
    "wkdir_path = Path(wkdir)\n",
    "\n",
    "misexp_cntrl_sv_dir = wkdir_path.joinpath(\"5_misexp_vrnts/test_cntrl_sets\")\n",
    "\n",
    "# gene-level VEP MSC\n",
    "misexp_vrnt_gene_msc_path = misexp_cntrl_sv_dir.joinpath(\"misexp_vrnt_gene_msc_consq.tsv\")\n",
    "# all VEP MSC\n",
    "vep_msc_path = wkdir_path.joinpath(\"4_vrnt_enrich/sv_vep/msc/SV_vep_hg38_msc_parsed.tsv\")\n",
    "# variant position \n",
    "misexp_vrnt_position_path = misexp_cntrl_sv_dir.joinpath(\"misexp_vrnt_gene_position.tsv\")\n",
    "# genomic scores \n",
    "vrnt_scores_path = wkdir_path.joinpath(\"5_misexp_vrnts/scores/features/vrnt_features_scores.csv\")\n",
    "# functional scores \n",
    "vrnt_functional_scores_path = wkdir_path.joinpath(\"5_misexp_vrnts/functional/features/vrnt_features_reg_annot.csv\")\n",
    "# add count of misexpression events per SV-gene pair \n",
    "sv_info_path = \"/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/lof_missense/data/sv_vcf/info_table/final_sites_critical_info_allele.txt\"\n",
    "# gene-level features\n",
    "inactive_gene_features_path = wkdir_path.joinpath(\"3_misexp_genes/inactive_gene_features_8650.csv\")\n",
    "# gene-sample information \n",
    "misexp_vrnt_gene_smpl_path = misexp_cntrl_sv_dir.joinpath(\"misexp_vrnt_gene_smpl.tsv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "4d8b8bca",
   "metadata": {},
   "outputs": [],
   "source": [
    "out_dir = wkdir_path.joinpath(\"6_misexp_dissect/vrnt_features\")\n",
    "out_dir.mkdir(parents=True, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "e40e52d4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# load all files \n",
    "misexp_vrnt_gene_msc_df = pd.read_csv(misexp_vrnt_gene_msc_path, sep=\"\\t\").rename(columns={\"consequence\": \"gene_msc\"})\n",
    "vep_msc_df = pd.read_csv(vep_msc_path, sep=\"\\t\").rename(columns={\"Uploaded_variation\": \"vrnt_id\", \"Consequence\": \"msc\"})\n",
    "misexp_vrnt_position_df = pd.read_csv(misexp_vrnt_position_path, sep=\"\\t\")\n",
    "vrnt_scores_df = pd.read_csv(vrnt_scores_path, sep=\",\")\n",
    "vrnt_functional_scores_df = pd.read_csv(vrnt_functional_scores_path, sep=\",\")\n",
    "sv_info_df = pd.read_csv(sv_info_path, sep=\"\\t\", dtype={\"plinkID\": str}).rename(columns={\"plinkID\":\"vrnt_id\"})\n",
    "inactive_gene_features_df = pd.read_csv(inactive_gene_features_path, sep=\",\")\n",
    "misexp_vrnt_gene_smpl_df = pd.read_csv(misexp_vrnt_gene_smpl_path, sep=\"\\t\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "863665a1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# features to include\n",
    "vrnt_scores_to_include = ['vrnt_id', 'CADD_sv_raw_score', 'phylop_max', \"gnomad_constraint_max_zscore\", 'gwrvis_min',\n",
    "                          'intersect_har']\n",
    "functional_scores_to_include = ['vrnt_id', 'gm12878_shared_intersect_tad_boundary', 'A_overlap', 'B_overlap', 'CTCFonlyCTCFbound_all',\n",
    "                                'CTCFonlyCTCFbound_CD14_monocyte', 'HighCTCF_B_cell', 'HighCTCF_Neutrophil',\n",
    "                                'intersect_cpg_isl', 'TssA', 'TssAFlnk', 'TxFlnk', 'Tx', 'TxWk', 'EnhG',\n",
    "                                'Enh', 'ZNFRpts', 'Het', 'TssBiv', 'BivFlnk', 'EnhBiv', 'ReprPC', 'ReprPCWk',\n",
    "                                'Quies']\n",
    "gene_features_to_include = ['gene_id', 'oncogene', 'Episcore', 'pHaplo', 'pTriplo','pLI','pNull', 'EDS', \n",
    "                            'oe_lof_upper', 'approved_target','decipher_gene','omim_gene']\n",
    "misexp_smpl_info_to_keep = [\"vrnt_id\", \"gene_id\", \"egan_id\", \"rna_id\", \"TPM\", \"z-score\", \"genotype\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "5f3b8117",
   "metadata": {},
   "outputs": [],
   "source": [
    "# add variant VEP MSC\n",
    "misexp_vrnt_vep_msc_df = pd.merge(misexp_vrnt_gene_msc_df, \n",
    "                                  vep_msc_df[[\"vrnt_id\", \"msc\"]], \n",
    "                                  on=\"vrnt_id\", \n",
    "                                  how=\"inner\")\n",
    "\n",
    "# add variant position relative to misexpressed gene \n",
    "misexp_vrnt_vep_msc_pos_df = pd.merge(misexp_vrnt_vep_msc_df,\n",
    "                                      misexp_vrnt_position_df.drop(columns=[\"SVTYPE\", \"consequence\"]), \n",
    "                                      on=[\"vrnt_id\", \"gene_id\"], \n",
    "                                      how=\"inner\"\n",
    "                                     )\n",
    "# add variant genomic scores\n",
    "misexp_vrnt_vep_msc_pos_scores_df = pd.merge(misexp_vrnt_vep_msc_pos_df, \n",
    "                                            vrnt_scores_df[vrnt_scores_to_include], \n",
    "                                             on=\"vrnt_id\", \n",
    "                                             how=\"inner\"\n",
    "                                            )\n",
    "# add variant functional scores \n",
    "misexp_vrnt_func_added_df = pd.merge(misexp_vrnt_vep_msc_pos_scores_df, \n",
    "                                            vrnt_functional_scores_df[functional_scores_to_include], \n",
    "                                             on=\"vrnt_id\", \n",
    "                                             how=\"inner\"\n",
    "                                            )\n",
    "# add SV info\n",
    "misexp_vrnt_maf_ac_added_df = pd.merge(misexp_vrnt_func_added_df, \n",
    "                                      sv_info_df[[\"vrnt_id\", \"AF\", \"AC\"]], \n",
    "                                       on=\"vrnt_id\", \n",
    "                                       how=\"inner\"\n",
    "                                      )\n",
    "# add misexpressed gene properties \n",
    "misexp_vrnt_gene_features_added_df = pd.merge(misexp_vrnt_maf_ac_added_df, \n",
    "                                      inactive_gene_features_df[gene_features_to_include], \n",
    "                                       on=\"gene_id\", \n",
    "                                       how=\"inner\"\n",
    "                                      )\n",
    "# add sample information\n",
    "misexp_vrnt_features_final_df = pd.merge(misexp_vrnt_gene_features_added_df, \n",
    "                                        misexp_vrnt_gene_smpl_df[misexp_smpl_info_to_keep], \n",
    "                                         on=[\"vrnt_id\", \"gene_id\"], \n",
    "                                         how=\"inner\"\n",
    "                                        )\n",
    "if misexp_vrnt_gene_smpl_df.shape[0] != misexp_vrnt_features_final_df.shape[0]: \n",
    "    raise ValueError(\"Input dataframe has different number of vrnt-gene-samples\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "37984a1d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# write to file \n",
    "misexp_vrnt_features_final_path = out_dir.joinpath(f\"misexp_vrnt_features.tsv\")\n",
    "misexp_vrnt_features_final_df.to_csv(misexp_vrnt_features_final_path, sep=\"\\t\", index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5b84ce90",
   "metadata": {},
   "source": [
    "**Check no sample swaps for bams**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "0b570312",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "No swapped samples in dataframe.\n"
     ]
    }
   ],
   "source": [
    "swap_samples = {\"INT_RNA7879032\", \"INT_RNA7879033\", \"INT_RNA7960192\",\n",
    "                \"INT_RNA7960193\", \"INT_RNA7709692\", \"INT_RNA7709693\",\n",
    "                \"INT_RNA7710161\", \"INT_RNA7710162\", \"INT_RNA7710163\", \n",
    "                \"INT_RNA7710164\"}\n",
    "misexp_rna_ids = set(misexp_vrnt_features_final_df.rna_id.unique())\n",
    "if not misexp_rna_ids.intersection(swap_samples): \n",
    "    print(\"No swapped samples in dataframe.\")\n",
    "else:\n",
    "    print(f\"Swapped samples in dataframe: {misexp_rna_ids.intersection(swap_samples)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6c9f3f46",
   "metadata": {},
   "source": [
    "**EGAN IDs for DUP cram files**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "28cb2b6a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of misexpression-associated DUP carriers: 22\n"
     ]
    }
   ],
   "source": [
    "dup_misexp_egan_ids = misexp_vrnt_features_final_df[(misexp_vrnt_features_final_df.SVTYPE == \"DUP\")].egan_id.unique()\n",
    "print(f\"Number of misexpression-associated DUP carriers: {len(dup_misexp_egan_ids)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "a086955b",
   "metadata": {},
   "outputs": [],
   "source": [
    "dup_misexp_egan_id_path = out_dir.joinpath(\"misexp_dup_carriers_egan_ids.txt\")\n",
    "with open(dup_misexp_egan_id_path, \"w\") as f_out: \n",
    "    for egan_id in dup_misexp_egan_ids: \n",
    "        f_out.write(f\"{egan_id}\\n\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
